from pyhanlp import *
import gc


def group_consecutives(vals, step=1):
    """Split ``vals`` into runs in which each value is the previous one plus ``step``.

    A new run is opened whenever a value differs from the preceding value
    plus ``step``. The returned list always holds at least one run, so an
    empty ``vals`` produces ``[[]]``.
    """
    groups = [[]]
    upcoming = None  # value that would continue the current run
    for value in vals:
        if upcoming is not None and value != upcoming:
            groups.append([])
        groups[-1].append(value)
        upcoming = value + step
    return groups


def find_word(text, words, del_status):
    """Propose compound words found in ``text`` by dependency parsing.

    Every entry of ``words`` is added to HanLP's ``CustomDictionary`` before
    parsing; this function never removes them again. ``text`` is then parsed
    with ``HanLP.parseDependency`` and each token is inspected:

    * a token whose relation is "定中关系" (attributive) contributes a
      ``modifier -> head`` pair made of its own lemma and its head's lemma;
    * a token whose relation is "右附加关系" (right adjunct) records that its
      head lemma may be extended by this token's lemma.

    A pair's modifier is replaced by such an extended form when the original
    head lemma occurs inside the modifier and the extended form followed by
    the pair's head occurs in ``text``. Pairs whose ``modifier + head`` is a
    substring of ``text`` are kept; pairs at consecutive token positions are
    chained into one phrase (all modifiers in order, then the head of the
    last pair).

    Args:
        text: Sentence to analyse.
        words: Iterable of strings to register in the custom dictionary.
        del_status: When equal to ``0`` the parse result is released and one
            garbage-collection pass is run before returning; any other value
            skips that step.

    Returns:
        A tuple ``(result, word_cut)`` where ``result`` is the list of
        proposed phrases and ``word_cut`` is the list of token lemmas in
        parse order.
    """
    CustomDictionary = JClass("com.hankcs.hanlp.dictionary.CustomDictionary")
    for entry in words:
        CustomDictionary.add(entry)

    sentence = HanLP.parseDependency(text)

    word_cut = []
    # token index -> [modifier, head]; dict order follows token order.
    pairs = {}
    # (head lemma, head lemma + adjunct lemma) candidates for extending modifiers.
    extensions = []
    for i, word in enumerate(sentence.getWordArray()):
        word_cut.append(word.LEMMA)
        if word.DEPREL == "定中关系":
            pairs[i] = [word.LEMMA, word.HEAD.LEMMA]
        elif word.DEPREL == "右附加关系":
            extensions.append((word.HEAD.LEMMA, word.HEAD.LEMMA + word.LEMMA))

    # Upgrade modifiers to their extended form when the text supports it.
    # The modifier is re-read on every check because an earlier extension
    # may already have replaced it.
    for pair in pairs.values():
        for base, extended in extensions:
            if base in pair[0] and extended + pair[1] in text:
                pair[0] = extended

    # Keep only pairs that literally appear in the text as modifier + head.
    location = [
        i for i, (modifier, head) in pairs.items() if modifier + head in text
    ]

    result = []
    for run in group_consecutives(location):
        if not run:
            # group_consecutives yields a single empty run for empty input.
            continue
        # Chain every modifier of the run, then close with the last head;
        # for a single-element run this is simply modifier + head.
        phrase = "".join(pairs[i][0] for i in run) + pairs[run[-1]][1]
        if phrase:
            result.append(phrase)

    if del_status == 0:
        # Deleting entries from locals() never unbinds real locals, so the
        # parse result is released explicitly and the collector runs once.
        del sentence
        gc.collect()
    return result, word_cut


def text_get(text, words):
    """Iteratively discover new words in ``text`` and log the seed words.

    The seed ``words`` are first appended to ``add_words.txt`` (UTF-8, one
    word per line), skipping entries already present in the file or repeated
    within ``words``. ``find_word`` is then run repeatedly on ``text``; the
    phrases it proposes are added to ``words`` so that the next round parses
    with them in the custom dictionary. Iteration stops after a round that
    proposes nothing, or after seven rounds.

    Before returning, every entry of ``words`` (seed words and discovered
    phrases alike) is removed from HanLP's ``CustomDictionary``; this also
    happens when a round raises.

    Args:
        text: Sentence to analyse.
        words: List of seed words. NOTE: the list is extended in place with
            every phrase discovered, so the caller's list grows.

    Returns:
        A tuple ``(new_word, seg_data)`` where ``new_word`` is a list of
        ``[phrases, round_index]`` entries and ``seg_data`` holds the token
        lemma list produced by each round.
    """
    path = "add_words.txt"
    max_rounds = 7  # rounds are numbered 0..6

    # A missing file is treated as an empty word list; the append below
    # creates it when there is something to write.
    try:
        with open(path, encoding="utf8") as f_data:
            known = {line.rstrip("\n") for line in f_data}
    except FileNotFoundError:
        known = set()

    fresh = []
    for word in words:
        entry = str(word)
        if entry not in known:
            known.add(entry)  # also de-duplicates repeats within ``words``
            fresh.append(entry + "\n")
    if fresh:
        # Same encoding as the read above, so the file stays decodable
        # regardless of the platform's default encoding.
        with open(path, "a", encoding="utf8") as file:
            file.writelines(fresh)

    CustomDictionary = JClass("com.hankcs.hanlp.dictionary.CustomDictionary")
    seg_data = []
    new_word = []
    try:
        for count in range(max_rounds):
            result, word_cut = find_word(text, words, 1)
            seg_data.append(word_cut)
            words.extend(result)
            new_word.append([result, count])
            if not result:
                break
    finally:
        # find_word leaves its additions in the shared custom dictionary;
        # undo them even if a round failed.
        for element in words:
            CustomDictionary.remove(element)
        gc.collect()
    return new_word, seg_data

# print(text_get("症状80%以上的患者在短时间内突然发生呼吸困难、烦躁不安、多汗、心悸、胸痛。",["抗凝药物"]))
